qPCR data QC

Author

Laura Vermeren & Symul

Published

May 7, 2025

Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)

tmp <- fs::dir_map("R scripts/", source)

theme_set(theme_light())

Loading the data

Code
data_source <- "real"
qpcr_dir <- str_c(get_data_dir(data_source), "00 Raw/03 qPCR/")
file <- "VIBRANT_qPCR_data_merged_250416.csv"
cat("We load the file", file, "\n\n")
We load the file VIBRANT_qPCR_data_merged_250416.csv 
Code
qpcr <- read_csv(str_c(qpcr_dir, file))

The data is provided in long format:

Code
qpcr |> glimpse()
Rows: 51,480
Columns: 20
$ Data_row                 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ Well                     <chr> "A02", "A03", "A04", "A05", "A06", "A07", "A0…
$ Fluor                    <chr> "Cy5", "Cy5", "Cy5", "Cy5", "Cy5", "Cy5", "Cy…
$ Content                  <chr> "Unkn-01", "Unkn-01", "Unkn-01", "Unkn-02", "…
$ Replicate                <dbl> 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, …
$ Sample                   <chr> "A1", "A1", "A1", "A2", "A2", "A2", "A3", "A3…
$ Cq                       <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, …
$ `Starting Quantity (SQ)` <dbl> 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+…
$ `Cq Mean`                <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.000…
$ plate_id                 <chr> "B064152", "B064152", "B064152", "B064152", "…
$ Plate_number             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ gDNA_Plate_ID            <dbl> 2240890, 2240890, 2240890, 2240890, 2240890, …
$ VMRC_Group               <chr> "VMRC_3", "VMRC_3", "VMRC_3", "VMRC_3", "VMRC…
$ Group_Fluor              <chr> "VMRC_3_Cy5", "VMRC_3_Cy5", "VMRC_3_Cy5", "VM…
$ Target                   <chr> "185329", "185329", "185329", "185329", "1853…
$ gDNA_Plate_Well          <chr> "2240890_A1", "2240890_A1", "2240890_A1", "22…
$ VIBR_Sample_ID           <chr> "2152456", "2152456", "2152456", "2157507", "…
$ Dilution_Factor          <dbl> 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3…
$ Quant_Adjusted           <dbl> 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+…
$ Copies_per_swab          <dbl> 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, 0.0e+00, …
  • Data_row is the row number
  • Well is the well number from A02 to P19
  • Fluor is the fluorescent dye used (Cy5, FAM, HEX)
  • Content ???
  • Replicate is the replicate number from 1 to 96 (with 495 missing values). Values are repeated 495, or 990 times

TODO: check what that is exactly

  • Sample is a sample ID — ?

  • Cq is the quantification cycle (Cq) value.

  • Starting Quantity (SQ) is ???

  • Cq Mean is the mean Cq value for the replicate ???

TODO: check what that is exactly

  • plate_id is the qPCR plate ID; there are 55 unique plate IDs in total.

  • Plate_number is the extraction plate number; there are 11 unique plate numbers in total (5 plate_id per Plate_number).

  • gDNA_Plate_ID is the gDNA plate ID; there are 11 unique gDNA plate IDs in total (match the Plate_number).

  • VMRC_Group is the VMRC group; there are 5 unique VMRC groups in total (VMRC_1, VMRC_2, VMRC_3, VMRC_4, VMRC_5).

  • Group_Fluor combines the Fluor and VMRC_Group columns

  • Target is the target “gene” (taxa/strain); there are 15 unique targets in total (122010, 185329, C0006A1, C0022A1, C0028A1, C0059E1, C0112A1, C0175A1, FF00004, FF00018, FF00051, FF00064, FF00072, UC101, UC119).

Code
qpcr |> 
  dplyr::count(Fluor, VMRC_Group, Group_Fluor, Target)
# A tibble: 15 × 5
   Fluor VMRC_Group Group_Fluor Target      n
   <chr> <chr>      <chr>       <chr>   <int>
 1 Cy5   VMRC_1     VMRC_1_Cy5  C0175A1  3432
 2 Cy5   VMRC_2     VMRC_2_Cy5  UC101    3432
 3 Cy5   VMRC_3     VMRC_3_Cy5  185329   3432
 4 Cy5   VMRC_4     VMRC_4_Cy5  FF00004  3432
 5 Cy5   VMRC_5     VMRC_5_Cy5  UC119    3432
 6 FAM   VMRC_1     VMRC_1_FAM  C0022A1  3432
 7 FAM   VMRC_2     VMRC_2_FAM  FF00018  3432
 8 FAM   VMRC_3     VMRC_3_FAM  C0028A1  3432
 9 FAM   VMRC_4     VMRC_4_FAM  C0006A1  3432
10 FAM   VMRC_5     VMRC_5_FAM  FF00064  3432
11 HEX   VMRC_1     VMRC_1_HEX  C0059E1  3432
12 HEX   VMRC_2     VMRC_2_HEX  FF00051  3432
13 HEX   VMRC_3     VMRC_3_HEX  C0112A1  3432
14 HEX   VMRC_4     VMRC_4_HEX  122010   3432
15 HEX   VMRC_5     VMRC_5_HEX  FF00072  3432

For now, we only have data for the 15 LBP strains.

  • gDNA_Plate_Well is the gDNA plate well; there are 1144 unique gDNA plate wells in total in the format [gDNA_Plate_ID]_[Well] (e.g., 2240890_A4).

  • VIBR_Sample_ID is the VIBRANT sample ID; there are 1016 unique VIBRANT sample IDs in total.

  • Dilution_Factor is the dilution factor; there are 4 unique dilution factors in total (5, 10, 20, 30).

  • Quant_Adjusted is the adjusted quantification value; they range from 0 to 3^{8}.

  • Copies_per_swab is the number of copies per swab; they range from 0 to 7.5^{10}.

Tidy names

We tidy the column names for consistency and readability.

Code
qpcr <- qpcr |> janitor::clean_names()
colnames(qpcr)
 [1] "data_row"             "well"                 "fluor"               
 [4] "content"              "replicate"            "sample"              
 [7] "cq"                   "starting_quantity_sq" "cq_mean"             
[10] "plate_id"             "plate_number"         "g_dna_plate_id"      
[13] "vmrc_group"           "group_fluor"          "target"              
[16] "g_dna_plate_well"     "vibr_sample_id"       "dilution_factor"     
[19] "quant_adjusted"       "copies_per_swab"     

Sample types and unique sample IDs

We define a sample_type variable to summarize information from vibr_sample_id and sample columns.

Code
qpcr <-  
  qpcr |> 
  mutate(
    sample_type = 
      case_when(
        str_detect(vibr_sample_id, "#N/A") & str_detect(sample, "std") ~ "Standard",
        str_detect(vibr_sample_id, "#N/A") & str_detect(sample, "water") ~ "Water",
        str_detect(vibr_sample_id, "EQ") ~ "Control sample",
        str_detect(vibr_sample_id, "empty") ~ "Empty",
        TRUE ~ "VIBRANT clinical sample"
      )
  )
Code
qpcr |> dplyr::count(sample_type) |> arrange(-n)
# A tibble: 5 × 2
  sample_type                 n
  <chr>                   <int>
1 VIBRANT clinical sample 43650
2 Standard                 3465
3 Control sample           1980
4 Empty                    1890
5 Water                     495
Code
tmp <- 
  qpcr |> 
  filter(sample_type == "VIBRANT clinical sample", target == target[1]) |>
  group_by(vibr_sample_id) |> 
  summarize(
    n_replicates = n(),
    n_plates = plate_number |> unique() |> length()
  )

# tmp |> dplyr::count(n_plates) # All samples's replicates are on a single plate

# tmp |> dplyr::count(n_replicates) # All samples have 3 replicates

We note that all samples’s replicates are on a single plate (plate_number or g_dna_plate_id); and all samples have the same number of replicates (3).

The plate layout is as follows:

Code
qpcr <- 
  qpcr |> 
  mutate(well_col = well |> str_sub(1, 1), well_row = well |> str_sub(2,3) |> as.integer()) |> 
  relocate(well_col, well_row, .after = well)
Code
qpcr |> 
  select(plate_number, well_col, well_row, sample_type, sample) |>
  distinct() |>
  ggplot() +
  aes(x =  well_row |> factor(), y = well_col |> factor() |> fct_rev(), fill = sample_type) +
  geom_tile() +
  geom_path(aes(group = sample), col = "black", alpha = 0.5) +
  geom_point() +
  facet_wrap(~ plate_number, labeller = label_both) +
  xlab("Well column") + ylab("Well row") +
  labs(caption = "Black lines connect replicates") 

We also define a unique sample ID according to each sample type.

Code
qpcr <- 
  qpcr |> 
  mutate(
    sample_id = 
      case_when(
        sample_type == "VIBRANT clinical sample" ~ vibr_sample_id,
        sample_type == "Control sample" ~ vibr_sample_id,
        sample_type == "Empty" ~ str_c(vibr_sample_id,"_plate_", plate_number |> str_pad(width = 2, pad = 0),"_", sample),
        sample_type == "Water" ~ str_c("water_plate", plate_number |> str_pad(width = 2, pad = 0)),
        sample_type == "Standard" ~ str_c("std_", content |> str_remove("Std-"), "_plate_", plate_number |> str_pad(width = 2, pad = 0)),
        TRUE ~ NA_character_
      )
  ) |> 
  group_by(sample_id, target) |>
  arrange(well_col, well_row) |>
  mutate(replicate_nb = row_number()) |> 
  ungroup() |> 
  mutate(uid = str_c(sample_id, "_r", replicate_nb))

Target’s plate layout and fluorescent probes

The plate x Fluorescence x Target layout is as follows:

Code
qpcr |> 
  dplyr::count(fluor, target, vmrc_group, plate_id, plate_number) |> 
  arrange(plate_number, vmrc_group, plate_id, fluor) |> 
  mutate(
    target = target |> fct_inorder(), 
    plate_nb = str_c("plate nb: ", plate_number) |> fct_inorder(),
    plate_id = plate_id |> fct_inorder()
    ) |> 
  ggplot() +
  aes(
    x = plate_id,
    y = target |> factor() |> fct_rev(),
    fill = fluor
  ) +
  geom_tile() +
  facet_grid(vmrc_group ~ plate_nb, scales = "free", space = "free") +
  xlab("Plate ID") +
  ylab("Target") +
  scale_fill_manual(
    name = "Fluor.",
    values = c("FAM" = "dodgerblue1", "HEX" = "green3", "Cy5" = "#F8766D")
  ) +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  )

Potential batch effects due to extraction should be checked for plate_number and those due to the qPCR itself using the plate_id.

Dilution factors ?

Code
qpcr |> 
  dplyr::count(target, dilution_factor) 
# A tibble: 36 × 3
   target  dilution_factor     n
   <chr>             <dbl> <int>
 1 122010                5  2808
 2 122010               10   312
 3 122010               20   312
 4 185329               20  3120
 5 185329               30   312
 6 C0006A1               5  2808
 7 C0006A1              10   312
 8 C0006A1              20   312
 9 C0022A1               5  3120
10 C0022A1              10   312
# ℹ 26 more rows

Exploratory & QC analyses

Total number of copies per swab per sample type

Code
g <- 
  qpcr |> 
  ggplot() +
  aes(x = target, y = copies_per_swab, col = sample_type, fill = sample_type) +
  geom_boxplot(alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) 

g 

Code
g + scale_y_log10()

Code
g <- 
  qpcr |> 
  ggplot() +
  aes(x = target, y = copies_per_swab, col = sample_type, fill = sample_type) +
  geom_boxplot(alpha = 0.5) +
  facet_grid(sample_type ~ ., scales = "free") +
  guides(fill = "none", color = "none") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  )

g 

Code
g + scale_y_log10()

Code
g <- 
  qpcr |> 
  ggplot() +
  aes(x = plate_number |> factor(), y = copies_per_swab, col = target, fill = target) +
  geom_boxplot(alpha = 0.5) +
  facet_grid(sample_type ~ ., scales = "free") +
  xlab("Plate number") +
  theme(
    strip.text.y = element_text(angle = 0),
    strip.text.x = element_text(angle = 90, hjust = 0)
  )

g 

Code
g + scale_y_log10() 

Empty and water samples

From the plot above, it looks like the empty and water samples have non-zero values on a few plates.

Code
plot_qpcr_empties <- function(qpcr, color_by = "sample_type") {
  qpcr |> 
    filter(sample_type %in% c("Empty", "Water")) |> 
    group_by(sample, plate_id, plate_number, target) |>
    mutate(replicate_nb = row_number()) |> 
    ungroup() |> 
    ggplot() +
    aes(x = sample, y = copies_per_swab, label = replicate_nb, col = !!sym(color_by)) +
    geom_text(size = 3) +
    facet_grid(target ~ plate_number, scales = "free_x", space = "free_x") +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      strip.text.y = element_text(angle = 0)
    ) 
}
Code
plot_qpcr_empties(qpcr, color_by = "sample_type") 

Code
plot_qpcr_empties(qpcr |> mutate(zero_copies = (copies_per_swab == 0)), color_by = "zero_copies") 

Code
prob_target <-
  qpcr |> 
  select(target) |> 
  distinct() |> 
  mutate(prob_target = (target %in% c("C0022A1", "C0059E1", "C0175A1")))
Warning

We may have to check the values for targets C0175A1, C0022A1, C0059E1.

Code
prob_target <- 
  prob_target |> 
  mutate(prob_target = if_else(prob_target, "x", "v")) 

qpcr <- 
  qpcr |> 
  left_join(prob_target, by = "target") 

Control samples

Code
qpcr |> 
  filter(sample_type %in% c("Control sample")) |> 
  group_by(sample, plate_id, plate_number, target) |>
  mutate(replicate_nb = row_number()) |> 
  ungroup() |> 
  ggplot() +
  aes(x = vibr_sample_id, y = copies_per_swab, label = replicate_nb, col = replicate_nb |> factor()) +
  geom_text(size = 3) +
  scale_y_log10() +
  scale_color_discrete("replicate") +
  facet_grid(prob_target + target ~ plate_number, scales = "free_x", space = "free_x") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  ) 

From these plots, I assume that some of these controls are negative controls and others are positive controls.

TODO: check if they match the sampleID from the metagenomics data or if we need a manifest.

Replicability looks good, but we note the same issues with the problematic targets in the (likely) negative controls as in the empty samples.

Standards

TODO

Clinical samples

Code
qpcr |> 
  filter(sample_type %in% c("VIBRANT clinical sample")) |> 
  group_by(vibr_sample_id, sample, plate_id, plate_number, target) |>
  mutate(
    replicate_nb = row_number(),
    median_cps = median(copies_per_swab, na.rm = TRUE),
    mean_cps = mean(copies_per_swab, na.rm = TRUE)
    ) |> 
  ungroup() |> 
  mutate(vibr_sample_id = vibr_sample_id |> fct_reorder(mean_cps)) |>
  ggplot() +
  aes(x = vibr_sample_id, y = copies_per_swab, label = replicate_nb, col = replicate_nb |> factor()) +
  geom_line(aes(group = vibr_sample_id), col = "black", linewidth = 0.1) +
  geom_text(size = 3) +
  scale_y_log10() +
  scale_color_discrete("replicate") +
  scale_x_discrete("Samples", breaks = NULL) +
  facet_grid(prob_target + target ~ plate_number, scales = "free_x", space = "free_x") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  )

We notice the same issues with the problematic targets in plate 1-3 (bottom, left).

These three targets are from the same plates (plate_id column):

Code
qpcr |> 
  filter(prob_target == "x", plate_number %in% 1:3) |> 
  select(target, fluor, plate_id, plate_number) |> 
  distinct() |> 
  gt()
target fluor plate_id plate_number
C0175A1 Cy5 C115281 1
C0022A1 FAM C115281 1
C0059E1 HEX C115281 1
C0175A1 Cy5 C115282 2
C0022A1 FAM C115282 2
C0059E1 HEX C115282 2
C0175A1 Cy5 C115283 3
C0022A1 FAM C115283 3
C0059E1 HEX C115283 3
Code
qpcr |> 
  filter(prob_target == "x", plate_number %in% 1:3) |> 
  select(target, fluor, plate_id, plate_number, starts_with("well"), sample, vibr_sample_id, sample_type, copies_per_swab) |> 
  arrange(sample_type) |>
  ggplot() +
  aes(x = well_row |> factor(), y = well_col |> fct_rev(), fill = sample_type) +
  geom_tile() +
  geom_point(aes(size = copies_per_swab |> log10())) +
  geom_line(aes(group = sample), col = "black", alpha = 0.5) +
  facet_grid(target ~ plate_number) +
  coord_fixed() +
  xlab("Well column") + ylab("Well row") +
  labs(caption = "Black lines connect replicates") 

Code
qpcr |> 
  filter(prob_target == "x", plate_number %in% 1:4) |> 
  select(target, fluor, plate_id, plate_number, starts_with("well"), sample, vibr_sample_id, sample_type, dilution_factor, copies_per_swab) |> 
  arrange(sample_type) |>
  ggplot() +
  aes(x = well_row |> factor(), y = well_col |> fct_rev(), fill = sample_type) +
  geom_tile(aes(alpha = dilution_factor)) +
  geom_point(aes(size = copies_per_swab |> log10())) +
  geom_line(aes(group = sample), col = "black", alpha = 0.5) +
  facet_grid(target ~ plate_number) +
  coord_fixed() +
  xlab("Well column") + ylab("Well row") +
  labs(caption = "Black lines connect replicates") 

There isn’t a clear “geometrical” pattern on the plate, nor a clear dilution factor association. Let’s just assume that these three plates failed.

Warning

We could simply discard these values (exclude these targets x plates) or take the minimum when aggregating across replicates (probably not a good idea for the remaining good plates).

Code
qpcr |> 
  filter(prob_target == "x", plate_number %in% 1:4) |> 
  select(target, fluor, plate_id, plate_number, starts_with("well"), sample, vibr_sample_id, sample_type, dilution_factor, copies_per_swab, cq) |> 
  arrange(sample_type) |>
  ggplot() +
  aes(x = well_row |> factor(), y = well_col |> fct_rev(), fill = sample_type) +
  geom_tile(aes(alpha = dilution_factor)) +
  geom_point(aes(size = cq |> log10())) +
  geom_line(aes(group = sample), col = "black", alpha = 0.5) +
  facet_grid(target ~ plate_number) +
  coord_fixed() +
  xlab("Well column") + ylab("Well row") +
  labs(caption = "Black lines connect replicates") 

Creating SummarizedExperiment objects

We create two Summarized Experiment objects:

  • one with all values as provided in the original file, where each “sample” is a plate well and each feature is a target (LBP taxa), with the following assays

    • cq values (cq)
    • starting_quantity_sq
    • cq_mean
    • quant_adjusted
    • copies_per_swab_raw (current copies_per_swab column)
    • copies_per_swab (where “problematic” targets x plates are set to NA)

    The colData contains the following columns:

      + `uid`
      + `sample_id`
      + `sample`
      + `vibr_sample_id`
      + `sample_type`
      + `plate_number` 
      + `g_dna_plate_id`
      + (`dilution_factor`)

    The rowData contains the following columns:

      + `fluor`
      + `plate_ids` (the concatenation of `plate_id` for each `plate_number`)
      + `valid_plates` (the plate_number where signal looked good)
  • one with summarized values (aggregated across replicates) where each “sample” is a VIBRANT sample ID (including control samples) and each feature is a target (LBP taxa), with the following assays

    • copies_per_swab_med (the median copies_per_swab across replicates)
    • copies_per_swab_mean (the mean copies_per_swab across replicates)
    • copies_per_swab_cv (the coefficient of variation of the copies_per_swab across replicates)

    The colData contains the following columns:

      + `sample_type`
      + `vibr_sample_id`
      + `sample`
      + `plate_number` 
      + `g_dna_plate_id`
      + `dilution_factor`

    The rowData contains the following columns:

      + `fluor`
      + `vmrc_group`
      + `plate_ids` (the concatenation of `plate_id` for each `plate_number`)
      + `valid_plates` (the plate_number where signal looked good)

Raw SummarizedExperiment object

We first identify potential issues with qPCR plates if the copies_per_swab values are not zero for the empty and water wells.

Code
map(
  qpcr$vmrc_group |> unique(),
  ~{
    qpcr |> 
      filter(vmrc_group == .x) |> 
      filter(
        sample_type %in% c("Empty", "Water"), 
        str_detect(sample_id, "G12")|str_detect(sample_id, "H12")|str_detect(sample_id, "water")
        ) |>
      ggplot() +
      aes(x = target, y = copies_per_swab, col = target, fill = target) +
      # geom_boxplot(alpha = 0.5, outlier.shape = NA) +
      geom_point(aes(shape = sample_type), alpha = 0.3) +
      # ggbeeswarm::geom_quasirandom(alpha = 0.5) +
      scale_y_log10() + expand_limits(y = 1) +
      facet_wrap(. ~ plate_number + plate_id, nrow = 2, scales = "free_x") +
      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
      guides(col = "none", fill = "none") +
      ggtitle(str_c("qPCR plates for targets of group ", .x)) 
  }
)
[[1]]


[[2]]


[[3]]


[[4]]


[[5]]

Plates are flagged as non-valid if there are more than 25% of the empty or water samples with non-zero values.

Code
valid_plates <- 
  qpcr |> 
  filter(
    sample_type %in% c("Empty", "Water"), 
    str_detect(sample_id, "G12")|str_detect(sample_id, "H12")|str_detect(sample_id, "water")
  ) |>
  group_by(plate_number, plate_id, vmrc_group) |>
  summarize(
    n = n(),
    n_non_zero = sum(copies_per_swab > 0, na.rm = TRUE),
    .groups = "drop"
  ) |> 
  mutate(
    f_non_zero = n_non_zero / n,
    valid = (f_non_zero <= 1/4) & (n_non_zero <= 2)
  ) 
  
valid_plates |> filter(!valid) |> gt(caption = "qPCR plates flagged as non-valid.")
qPCR plates flagged as non-valid.
plate_number plate_id vmrc_group n n_non_zero f_non_zero valid
1 C115281 VMRC_1 27 16 0.5925926 FALSE
2 C115282 VMRC_1 27 17 0.6296296 FALSE
3 C115283 VMRC_1 27 14 0.5185185 FALSE
Code
qpcr <- 
  qpcr |> 
  left_join(
    valid_plates |> 
      dplyr::rename(valid_qpcr_plate = valid) |>
      select(plate_id, valid_qpcr_plate) |> 
      distinct(),
    by = join_by(plate_id)
  )
Code
make_raw_qpcr_SE <- function(qpcr){
  
  qpcr <- 
    qpcr |> 
    arrange(plate_number, well_col, well_row) |>
    mutate(uid = uid |> fct_inorder()) |> 
    arrange(vmrc_group, fluor) |> 
    mutate(target = target |> fct_inorder()) |> 
    arrange(uid, target)
    
  
  cq_assay <- 
    qpcr |>
    select(uid, target, cq) |> 
    arrange() |> 
    pivot_wider(names_from = uid, values_from = cq) |> 
    as.data.frame() |> 
    column_to_rownames("target") 
  
  starting_quantity_sq_assay <- 
    qpcr |>
    select(uid, target, starting_quantity_sq) |> 
    arrange() |> 
    pivot_wider(names_from = uid, values_from = starting_quantity_sq) |> 
    as.data.frame() |> 
    column_to_rownames("target")
  
   cq_mean_assay <- 
    qpcr |>
    select(uid, target, cq_mean) |> 
    arrange() |> 
    pivot_wider(names_from = uid, values_from = cq_mean) |> 
    as.data.frame() |> 
    column_to_rownames("target")
   
   quant_adjusted_assay <- 
     qpcr |>
     select(uid, target, quant_adjusted) |> 
     arrange() |> 
     pivot_wider(names_from = uid, values_from = quant_adjusted) |> 
     as.data.frame() |> 
     column_to_rownames("target")
   
   copies_per_swab_raw_assay <- 
     qpcr |>
     select(uid, target, copies_per_swab) |> 
     arrange() |> 
     pivot_wider(names_from = uid, values_from = copies_per_swab) |> 
     as.data.frame() |> 
     column_to_rownames("target")
   
  copies_per_swab_assay <- 
    qpcr |>
    mutate(copies_per_swab_valid = ifelse(valid_qpcr_plate, copies_per_swab, NA)) |> 
    select(uid, target, copies_per_swab_valid) |> 
    arrange() |> 
    pivot_wider(names_from = uid, values_from = copies_per_swab_valid) |> 
    as.data.frame() |> 
    column_to_rownames("target")
  
   coldata <- 
     qpcr |> 
     select(uid, sample_id, replicate_nb, sample, vibr_sample_id, sample_type, plate_number, g_dna_plate_id) |> 
     distinct() |> 
     arrange(uid) |> 
     as.data.frame() |> 
     mutate(rownames = uid) |> 
     column_to_rownames("rownames")
   
   rowdata <-
     qpcr |> 
     select(target, fluor, vmrc_group, plate_id, valid_qpcr_plate, plate_number) |> 
     distinct() |> 
     arrange(target, plate_number) |> 
     group_by(target) |> 
     mutate(
       plate_id = str_c(plate_id, " (gDNA plate ", plate_number |> str_pad(width = 2, pad = "0"), ")", sep = ""),
     ) |> 
     summarize(
       fluor = fluor |> unique() |> str_c(collapse = ", "),
       vmrc_group = vmrc_group |> unique() |> str_c(collapse = ", "),
       plate_ids = plate_id |> unique() |> str_c(collapse = ", "),
       non_valdid_plates = plate_id[!valid_qpcr_plate] |> unique() |> str_c(collapse = ", ")
     ) |> 
     ungroup() |> 
     as.data.frame() |> 
     mutate(rownames = target) |> 
     column_to_rownames("rownames")
   
   SE <- SummarizedExperiment(
     assays = list(
       cq = cq_assay |> as.matrix(),
       starting_quantity_sq = starting_quantity_sq_assay |> as.matrix(),
       cq_mean = cq_mean_assay |> as.matrix(),
       quant_adjusted = quant_adjusted_assay |> as.matrix(),
       copies_per_swab_raw = copies_per_swab_raw_assay |>  as.matrix(),
       copies_per_swab = copies_per_swab_assay |> as.matrix()
     ),
     colData = coldata,
     rowData = rowdata
   )
   
   SE
  
}
Code
SE_qPCR_raw <- make_raw_qpcr_SE(qpcr)

Aggregated SummarizedExperiment object

  • one with summarized values (aggregated across replicates) where each “sample” is a VIBRANT sample ID (including control samples) and each feature is a target (LBP taxa), with the following assays

    • copies_per_swab_med (the median copies_per_swab across replicates)
    • copies_per_swab_mean (the mean copies_per_swab across replicates)
    • copies_per_swab_cv (the coefficient of variation of the copies_per_swab across replicates)

    The colData contains the following columns:

      + `sample_type`
      + `vibr_sample_id`
      + `sample`
      + `plate_number` 
      + `g_dna_plate_id`
      + ? `dilution_factor`

    The rowData contains the following columns:

      + `fluor`
      + `vmrc_group`
      + `plate_ids` (the concatenation of `plate_id` for each `plate_number`)
      + `valid_plates` (the plate_number where signal looked good)
Code
make_agg_qpcr_SE <- function(qpcr){
  
  qpcr <- 
    qpcr |> 
    filter(sample_type %in% c("VIBRANT clinical sample", "Control sample")) |>
    arrange(plate_number, well_col, well_row) |>
    mutate(uid = uid |> fct_inorder(), sample_id = sample_id |> fct_inorder()) |> 
    arrange(vmrc_group, fluor) |> 
    mutate(target = target |> fct_inorder()) |> 
    arrange(uid, target)
    
  summarized_qpcr <- 
    qpcr |>
    group_by(sample_id, target) |> 
    summarize(
      copies_per_swab_med = median(copies_per_swab),
      copies_per_swab_mean = mean(copies_per_swab),
      copies_per_swab_sd = sd(copies_per_swab),
      .groups = "drop"
    ) |> 
    mutate(
      copies_per_swab_cv = ifelse(copies_per_swab_sd == 0, 0, copies_per_swab_sd / copies_per_swab_mean)
    ) 
  
  coldata <- 
    qpcr |> 
    select(sample_id, vibr_sample_id, sample, sample_type, plate_number, g_dna_plate_id) |>
    distinct() |> 
    arrange(sample_id) |> 
    left_join(
      qpcr |> 
        select(sample_id, plate_id, valid_qpcr_plate) |> 
        group_by(sample_id) |> 
        summarize(
          plate_ids = str_c(plate_id, collapse = ", "),
          non_valid_plate_ids = str_c(plate_id[!valid_qpcr_plate], collapse = ", ")
        ),
      by = join_by(sample_id)
    ) |> 
    as.data.frame() |> 
    mutate(rownames = sample_id) |> 
    column_to_rownames("rownames")
   
   rowdata <-
     qpcr |> 
     select(target, fluor, vmrc_group, plate_id, plate_number) |> 
     distinct() |> 
     arrange(target, plate_number) |> 
     group_by(target) |> 
     mutate(
       plate_id = str_c(plate_id, " (gDNA plate ", plate_number |> str_pad(width = 2, pad = "0"), ")", sep = ""),
     ) |> 
     summarize(
       fluor = fluor |> unique() |> str_c(collapse = ", "),
       vmrc_group = vmrc_group |> unique() |> str_c(collapse = ", "),
       plate_ids = plate_id |> unique() |> str_c(collapse = ", ")
     ) |> 
     ungroup() |> 
     as.data.frame() |> 
     mutate(rownames = target) |> 
     column_to_rownames("rownames") 
   
   SE <- SummarizedExperiment(
     assays = list(
       copies_per_swab_med = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = sample_id, values_from = copies_per_swab_med) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix(),
       copies_per_swab_mean = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = sample_id, values_from = copies_per_swab_mean) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix(),
       copies_per_swab_cv = 
         summarized_qpcr |> 
         pivot_wider(id_cols = target, names_from = sample_id, values_from = copies_per_swab_cv) |> 
         as.data.frame() |> 
         column_to_rownames("target") |> 
         as.matrix()
     ),
     colData = coldata,
     rowData = rowdata
   )
   
   SE
     
}
Code
SE_qPCR_agg <- make_agg_qpcr_SE(qpcr)

Save SummarizedExperiment objects

Save the SE objects to disk

Code
saveRDS(
  SE_qPCR_raw, 
  str_c(
    get_output_dir(data_source = data_source),  
    "03_se_pcr_raw_", today() |> str_remove_all("-"), ".rds"
    )
  )


saveRDS(
  SE_qPCR_agg, 
  str_c(
    get_output_dir(data_source = data_source),  
    "03_se_pcr_agg_", today() |> str_remove_all("-"), ".rds"
    )
  )